pacman::p_load(stringr,ggplot2, tidyr, ngram, dplyr, igraph, ggraph, visNetwork, tidygraph, graphlayouts,ggpubr, ggrepel, ggridges, viridis, network, reshape, tidytext)
# Print large numbers in full instead of scientific notation.
options(scipen = 999)
## read in data
# The metadata file is semicolon-separated; the relative path is resolved
# against the current working directory, which is left untouched.
imdb <- read.csv("../data-raw/movie_metadata.csv", sep = ";")
# Number of missing values in each column.
colSums(is.na(imdb))
## color director_name
## 0 0
## num_critic_for_reviews duration
## 50 15
## director_facebook_likes actor_3_facebook_likes
## 104 23
## actor_2_name actor_1_facebook_likes
## 0 7
## gross genres
## 884 0
## actor_1_name movie_title
## 0 0
## num_voted_users cast_total_facebook_likes
## 1 1
## actor_3_name facenumber_in_poster
## 0 13
## plot_keywords movie_imdb_link
## 0 0
## num_user_for_reviews language
## 22 0
## country content_rating
## 0 0
## budget title_year
## 493 109
## actor_2_facebook_likes imdb_score
## 14 1
## aspect_ratio movie_facebook_likes
## 330 1
Actors will be the nodes. Edges exist only if the actors have appeared in a movie together.
## extract the actors
# Keep only the three actor-name columns of every movie.
actors <- select(imdb, actor_1_name, actor_2_name, actor_3_name)
head(actors, n = 5)
## actor_1_name actor_2_name actor_3_name
## 1 CCH Pounder Joel David Moore Wes Studi
## 2 Johnny Depp Orlando Bloom Jack Davenport
## 3 Christoph Waltz Rory Kinnear Stephanie Sigman
## 4 Tom Hardy Christian Bale Joseph Gordon-Levitt
## 5 Doug Walker Rob Walker
# Drop every movie in which at least one of the three actor slots is empty.
actors <- filter(
  actors,
  actor_1_name != "",
  actor_2_name != "",
  actor_3_name != ""
)
The nodelist will only contain each actor’s name once.
## make the nodelist
# Stack the three name columns into key/value form and keep each distinct
# name once; the result has a single column called `value`.
actor_nodes <- distinct(select(gather(actors), value), value)
## Warning: attributes are not identical across measure variables;
## they will be dropped
head(actor_nodes, n = 5)
## value
## 1 CCH Pounder
## 2 Johnny Depp
## 3 Christoph Waltz
## 4 Tom Hardy
## 5 Daryl Sabara
Because each movie has three top actors given, some column manipulation is needed to format the data into the two “to” and “from” columns required for the edgelist.
## make the edgelist for actor_1 and actor_2
# Pair the first and second actor of every movie as a from/to edge.
temp_edges_1_2 <- dplyr::rename(
  na.omit(select(actors, actor_1_name, actor_2_name)),
  from = actor_1_name,
  to = actor_2_name
)
# Mark empty and single-space names as missing ...
temp_edges_1_2[temp_edges_1_2 == ""] <- NA
temp_edges_1_2[temp_edges_1_2 == " "] <- NA
# ... and discard every pair with a missing endpoint.
temp_edges_1_2 <- na.omit(temp_edges_1_2)
head(temp_edges_1_2, n = 5)
## from to
## 1 CCH Pounder Joel David Moore
## 2 Johnny Depp Orlando Bloom
## 3 Christoph Waltz Rory Kinnear
## 4 Tom Hardy Christian Bale
## 5 Daryl Sabara Samantha Morton
## edgelist for actor_1 and actor_3
# Pair the first and third actor of every movie as a from/to edge.
temp_edges_1_3 <- dplyr::rename(
  na.omit(select(actors, actor_1_name, actor_3_name)),
  from = actor_1_name,
  to = actor_3_name
)
# Mark empty and single-space names as missing.
temp_edges_1_3[temp_edges_1_3 == ""] <- NA
temp_edges_1_3[temp_edges_1_3 == " "] <- NA
## remove both values if there is even one NA present, eg Tom Hardy -> NA
temp_edges_1_3 <- na.omit(temp_edges_1_3)
head(temp_edges_1_3, n = 5)
## from to
## 1 CCH Pounder Wes Studi
## 2 Johnny Depp Jack Davenport
## 3 Christoph Waltz Stephanie Sigman
## 4 Tom Hardy Joseph Gordon-Levitt
## 5 Daryl Sabara Polly Walker
## edgelist for actor_2 and actor_3
# Pair the second and third actor of every movie as a from/to edge.
temp_edges_2_3 <- dplyr::rename(
  na.omit(select(actors, actor_2_name, actor_3_name)),
  from = actor_2_name,
  to = actor_3_name
)
# Mark empty and single-space names as missing ...
temp_edges_2_3[temp_edges_2_3 == ""] <- NA
temp_edges_2_3[temp_edges_2_3 == " "] <- NA
# ... and discard every pair with a missing endpoint.
temp_edges_2_3 <- na.omit(temp_edges_2_3)
head(temp_edges_2_3, n = 5)
## from to
## 1 Joel David Moore Wes Studi
## 2 Orlando Bloom Jack Davenport
## 3 Rory Kinnear Stephanie Sigman
## 4 Christian Bale Joseph Gordon-Levitt
## 5 Samantha Morton Polly Walker
## Combine the three sets of edges
# Stack the three pairwise edge lists into one from/to table. Pairs that occur
# in several movies stay as repeated rows. (No placeholder data frame is
# needed first: the result of rbind replaces it entirely.)
actor_edges <- rbind(temp_edges_1_2, temp_edges_1_3, temp_edges_2_3)
# Release the intermediate tables; the names stay defined, bound to NULL.
temp_edges_1_2 <- NULL
temp_edges_1_3 <- NULL
temp_edges_2_3 <- NULL
head(actor_edges)
## from to
## 1 CCH Pounder Joel David Moore
## 2 Johnny Depp Orlando Bloom
## 3 Christoph Waltz Rory Kinnear
## 4 Tom Hardy Christian Bale
## 5 Daryl Sabara Samantha Morton
## 6 J.K. Simmons James Franco
## create the graph
# Undirected graph with one edge per row of the edge list.
actors_in_same_movies <- graph_from_data_frame(
  d = actor_edges,
  directed = FALSE
)
Here is a simple network plot of the constructed network.
Here a densely connected ‘hairball’ can be seen surrounded by many small nodes that are not connected to the main component.
## write to graphml for Gephi purposes
# Export the raw (unweighted, multi-edge) graph in GraphML format.
write.graph(
  actors_in_same_movies,
  "../data-out/graphs/actors_in_same_movies.graphml",
  format = "graphml"
)
## Weight edges instead of duplicate edges
# Sparse matrix with one row per `from` name and one column per `to` name,
# holding 1 wherever that pair occurs in the edge list.
casted_actors <- cast_sparse(
  mutate(actor_edges, val = 1),
  row = from,
  column = to,
  value = val
)
# Read the matrix as a two-mode (bipartite) graph, then project it onto the
# vertices whose type is "true".
# NOTE(review): this appears to keep only the column side (the `to` names) and
# to link two of them when they share a `from` partner -- confirm that this is
# the intended co-appearance network.
spread_graph <- graph_from_incidence_matrix(casted_actors)
projected <- bipartite.projection(spread_graph, which = "true")
Eigenvector centrality (also called eigencentrality) is a measure of the influence of a node in a network. It assigns relative scores to all nodes in the network based on the concept that connections to high-scoring nodes contribute more to the score of the node in question than equal connections to low-scoring nodes.
# Load the pre-computed global eigen decomposition from disk.
# NOTE(review): `$values` presumably holds the eigenvalues of the adjacency
# matrix rather than a per-vertex eigenvector centrality -- confirm how the
# cached file was produced before interpreting `g_e_values` per actor.
e_values <- readRDS("../data-out/g_eigen_values.RDs")
projected <- set_vertex_attr(
  projected,
  name = "g_e_values",
  value = e_values$values
)
# Density of the stored values.
ggplot(as.data.frame(e_values["values"]), aes(values)) +
  geom_density() +
  xlab("Eigen value")
Eigen values are negatively skewed for the whole graph. This means that most nodes are not connected to high scoring nodes.
In a connected graph, the normalized closeness centrality (or closeness) of a node is the reciprocal of the average length of the shortest paths between the node and all other nodes in the graph. Thus the more central a node is, the closer it is to all other nodes. (adapted from Wikipedia)
An actor will be well connected if many other actors can be reached in a small number of hops.
# Closeness centrality of every vertex of the projected graph.
close_cent <- igraph::closeness(activate(as_tbl_graph(projected), nodes))
# Mean closeness over all vertices.
var_cc <- mean(close_cent)
projected <- set_vertex_attr(
  projected,
  name = "g_close_cent",
  value = close_cent
)
# Density of the global closeness scores.
ggplot(data.frame(closeness = unname(close_cent)), aes(closeness)) +
  geom_density() +
  xlab("Centrality (Closeness)")
So the closeness distribution is very interesting. There is a high number of nodes with a relatively high and relatively low closeness. This is due to the graph having many small components and one very densely connected large component. The mean value is 0.0000001.
Interpretively, the Bonacich power measure corresponds to the notion that the power of a vertex is recursively defined by the sum of the power of its alters. The nature of the recursion involved is then controlled by the power exponent: positive values imply that vertices become more powerful as their alters become more powerful (as occurs in cooperative relations), while negative values imply that vertices become more powerful only as their alters become weaker (as occurs in competitive or antagonistic relations). (stolen from Wiki)
Essentially, the importance of an actor is defined by the importance of alters, or other connected actors.
# Power centrality of every vertex, with a positive exponent of 0.9.
power_cent <- power_centrality(projected, exponent = 0.9)
# Largest power centrality in the graph.
var_pc <- max(power_cent)
projected <- set_vertex_attr(
  projected,
  name = "g_power_cent",
  value = power_cent
)
# Density of the global power-centrality scores.
ggplot(data.frame(power = as.numeric(power_cent)), aes(x = power)) +
  geom_density()
The distribution of Bonacich power is slightly positively skewed. Because a positive exponent (0.9) was used, vertices are considered more ‘powerful’ as their alters increase in power. The max power centrality is 14.8070446.
The PageRank algorithm ignores edge weights when calculating the importance of nodes. The more likely an actor will be found when randomly searching through movies, the higher the assigned PageRank.
# PageRank of every vertex of the projected graph.
# NOTE(review): page_rank() is called without `weights`; igraph's documented
# default uses a `weight` edge attribute when the graph has one -- confirm
# whether a weighted PageRank is intended here.
page_ranks <- page_rank(projected)
# Mean PageRank over all vertices.
var_pr <- mean(page_ranks$vector)
projected <- set_vertex_attr(
  projected,
  name = "g_page_rank",
  value = page_ranks$vector
)
# Density of the global PageRank scores.
ggplot(data.frame(page_r = as.numeric(page_ranks$vector)), aes(x = page_r)) +
  geom_density() +
  xlab("Page Rank")
Most nodes have a relatively low page rank. The mean page rank is 0.0001855.
Group Louvain optimises for modularity in the network and therefore tries to create densely connected clusters with sparse connections between the clusters.
# Assign every vertex to a Louvain community, using the edge attribute
# `weight` as the edge weights.
node_comms <- as.data.frame(
  mutate(
    activate(as_tbl_graph(projected), nodes),
    global_comm = group_louvain(weights = weight)
  )
)
# Store the community id on the graph as vertex attribute `comm`.
projected <- set_vertex_attr(projected, "comm", value = node_comms$global_comm)
# Re-read the node table so that it carries every vertex attribute set so far.
node_comms <- as.data.frame(as_tbl_graph(projected))
# Community sizes, with the counts on a log scale.
ggplot(node_comms, aes(x = comm)) +
  geom_bar() +
  scale_y_log10()
This graph shows the distribution of community size. The community size is exponentially distributed, resulting in a few large communities and many smaller ones. Some form of filtering on community size is needed to remove the smaller communities.
# Sizes of the communities with an id below 55; the dotted line sits between
# community 17 and community 18.
# NOTE(review): the cutoffs 55 and 17.5 are hard-coded -- confirm they still
# match the community numbering of the current run.
ggplot(filter(node_comms, comm < 55), aes(x = comm)) +
  geom_bar() +
  geom_vline(xintercept = 17.5, linetype = "dotted")
After removing communities smaller than 100 actors, only 17 communities remain.
# Keep only the nodes whose community id is at most 53.
# NOTE(review): this cutoff differs from the 55 used for the plot and from the
# 17 communities mentioned in the text -- confirm which one is intended.
node_comms_filtered <- node_comms %>%
  filter(comm <= 53)
# Copy each vertex name into a `Label` vertex attribute.
# `projected$name` reads a *graph* attribute (NULL here, which triggered the
# "length of NULL cannot be changed" warning); vertex names are V(projected)$name.
projected <- set_vertex_attr(
  projected,
  name = "Label",
  value = V(projected)$name
)
The 17 remaining communities were analysed in gephi. The nodes are coloured and grouped by community, while the size of the node and text are dependent on the degree of the node.
It is clear that the remaining communities are very densely connected meaning that even after optimising for modularity, actors have many connections outside their community. These dense connections may have negatively impacted the results of the Group Louvain and there is concern as to the true modularity of these communities.
# Nodes that belong to a community with more than 100 members, largest
# community first and alphabetical by name within equal sizes.
# `n` is the size of the node's community. The former `top_n(5)` step ranked by
# `n`, which is constant inside a community, so it kept every row; it is
# dropped here together with the intermediate sorts that were overwritten.
top_comms_nodes <- node_comms %>%
  as_tibble() %>%
  add_count(comm) %>%
  distinct() %>%
  filter(n > 100) %>%
  arrange(desc(n), name) %>%
  na.omit()
top_comms_nodes
## # A tibble: 3,958 x 7
## name g_e_values g_close_cent g_power_cent g_page_rank comm n
## <chr> <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 A.J. Langer -1. 0.000000140 -0.891 0.000126 1 355
## 2 Abigail Sp… 46.0 0.000000140 -0.0896 0.000253 1 355
## 3 Adam Butch… 39.1 0.000000140 -0.107 0.0000942 1 355
## 4 Adam Copel… -1. 0.000000140 0.0936 0.0000826 1 355
## 5 Aden Young 32.3 0.000000140 -1.07 0.000127 1 355
## 6 Adrian Paul 29.5 0.000000140 0.0936 0.0000826 1 355
## 7 Aidan McAr… 26.5 0.000000140 0.303 0.0000647 1 355
## 8 Aisha Tyler 25.2 0.000000140 -0.419 0.000103 1 355
## 9 Alan David -1. 0.000000140 -0.166 0.000107 1 355
## 10 Alex Jenni… -1. 0.000000140 -0.332 0.000112 1 355
## # … with 3,948 more rows
# Subgraph induced by all nodes of the retained (large) communities.
filtered_actor_edges <- subgraph(projected, top_comms_nodes$name)
# Node table and induced subgraph of community 1.
g_comm_1_nodes <- top_comms_nodes %>%
  filter(comm == 1)
g_comm_1_graph <- subgraph(projected, g_comm_1_nodes$name)
# Per-vertex eigenvector centrality inside community 1.
# The eigenvalues of the adjacency matrix are not per-vertex quantities, so
# they cannot be stored as a vertex attribute; the centrality score of each
# vertex is its entry in the leading eigenvector. `weights = NA` keeps the
# computation unweighted, as the plain adjacency matrix was.
# The list shape with a `values` element is kept for the code below.
e_values_1 <- list(
  values = unname(eigen_centrality(g_comm_1_graph, weights = NA)$vector)
)
g_comm_1_graph <- g_comm_1_graph %>%
  set_vertex_attr(name = "local_e_values", value = e_values_1$values)
# Density of the local eigenvector-centrality scores.
e_values_1["values"] %>%
  as.data.frame() %>%
  ggplot() +
  geom_density(aes(values)) +
  xlab("Eigenvector centrality")
Eigen values are negatively skewed for global community 1. This means that most nodes are not connected to high scoring nodes.
# Closeness centrality computed inside community 1 only.
close_cent <- closeness(g_comm_1_graph)
# Mean local closeness (the variable shares its name with stats::var).
var <- mean(close_cent)
g_comm_1_graph <- set_vertex_attr(
  g_comm_1_graph,
  name = "local_close_cent",
  value = close_cent
)
# Density of the local closeness scores.
ggplot(data.frame(closeness = unname(close_cent)), aes(closeness)) +
  geom_density() +
  xlab("Closeness centrality")
The average closeness centrality is 0.0009803. When looking at a single community we expect a higher average closeness than when calculating for the whole graph which was 0.0000001.
# Power centrality (exponent 0.9) computed inside community 1 only.
power_cent <- power_centrality(g_comm_1_graph, exponent = 0.9)
# Largest local power centrality in community 1.
var <- max(power_cent)
g_comm_1_graph <- set_vertex_attr(
  g_comm_1_graph,
  name = "local_power_cent",
  value = power_cent
)
# Density of the local power-centrality scores.
ggplot(data.frame(power = as.numeric(power_cent)), aes(x = power)) +
  geom_density()
The maximum Bonacich power centrality is 8.7819488.
# PageRank computed inside community 1 only.
page_ranks <- page_rank(g_comm_1_graph)
# Mean local PageRank in community 1.
var <- mean(page_ranks$vector)
g_comm_1_graph <- set_vertex_attr(
  g_comm_1_graph,
  name = "local_page_rank",
  value = page_ranks$vector
)
# Density of the local PageRank scores.
ggplot(data.frame(page_r = as.numeric(page_ranks$vector)), aes(x = page_r)) +
  geom_density() +
  xlab("Page Rank")
The local mean page rank is 0.0028169, compared to the global mean of 0.0001855.
Creating the graph of centrality measures for community 1.
# Collect the vertex attributes of the community-1 subgraph into a node table,
# one row per vertex, with global and local centrality columns.
# (`attributes` shares its name with base::attributes; the name is kept as is.)
attributes <- vertex_attr(g_comm_1_graph)
# Each column is extracted with `[[` and labelled explicitly; the power
# centrality was previously passed under the duplicated label
# `local_close_cent`.
g_comm_1_nodes <- data.frame(
  name = attributes[["name"]],
  comm = attributes[["comm"]],
  g_e_values = attributes[["g_e_values"]],
  g_close_cent = attributes[["g_close_cent"]],
  g_page_rank = attributes[["g_page_rank"]],
  g_power_cent = attributes[["g_power_cent"]],
  local_e_values = attributes[["local_e_values"]],
  local_page_rank = attributes[["local_page_rank"]],
  local_power_cent = attributes[["local_power_cent"]],
  local_close_cent = attributes[["local_close_cent"]],
  stringsAsFactors = FALSE
)
This is the creation of the subgraph that will only contain vertices listed in community 2.
# Node table and induced subgraph of community 2.
g_comm_2_nodes <- top_comms_nodes %>%
  filter(comm == 2)
g_comm_2_graph <- subgraph(projected, g_comm_2_nodes$name)
# Per-vertex eigenvector centrality inside community 2.
# The eigenvalues of the adjacency matrix are not per-vertex quantities, so
# they cannot be stored as a vertex attribute; the centrality score of each
# vertex is its entry in the leading eigenvector. `weights = NA` keeps the
# computation unweighted, as the plain adjacency matrix was.
# The list shape with a `values` element is kept for the code below.
e_values_2 <- list(
  values = unname(eigen_centrality(g_comm_2_graph, weights = NA)$vector)
)
g_comm_2_graph <- g_comm_2_graph %>%
  set_vertex_attr(name = "local_e_values", value = e_values_2$values)
# Density of the local eigenvector-centrality scores.
e_values_2["values"] %>%
  as.data.frame() %>%
  ggplot() +
  geom_density(aes(values)) +
  xlab("Eigenvector centrality")
Eigen values are skewed to the right for global community 2. This means that most nodes are not connected to high scoring nodes.
## Comm Centrality
## Closeness
# Closeness centrality computed inside community 2 only.
close_cent <- closeness(g_comm_2_graph)
# Mean local closeness (the variable shares its name with stats::var).
var <- mean(close_cent)
g_comm_2_graph <- set_vertex_attr(
  g_comm_2_graph,
  name = "local_close_cent",
  value = close_cent
)
# Density of the local closeness scores.
ggplot(data.frame(closeness = unname(close_cent)), aes(closeness)) +
  geom_density() +
  xlab("Closeness centrality")
The mean closeness centrality is 0.000959.
# Power centrality (exponent 0.9) computed inside community 2 only.
power_cent <- power_centrality(g_comm_2_graph, exponent = 0.9)
# Mean local power centrality in community 2.
var <- mean(power_cent)
g_comm_2_graph <- set_vertex_attr(
  g_comm_2_graph,
  name = "local_power_cent",
  value = power_cent
)
# Density of the local power-centrality scores.
ggplot(data.frame(power = as.numeric(power_cent)), aes(x = power)) +
  geom_density()
The mean Bonacich power centrality is -0.2283895.
## Page Rank
# PageRank computed inside community 2 only.
page_ranks <- page_rank(g_comm_2_graph)
# Mean local PageRank in community 2.
var <- mean(page_ranks$vector)
g_comm_2_graph <- set_vertex_attr(
  g_comm_2_graph,
  name = "local_page_rank",
  value = page_ranks$vector
)
# Density of the local PageRank scores.
ggplot(data.frame(page_r = as.numeric(page_ranks$vector)), aes(x = page_r)) +
  geom_density() +
  xlab("Page Rank")
The mean page rank for community 2 is 0.0028409.
Creating the graph of centrality measures for community 2.
# Collect the vertex attributes of the community-2 subgraph into a node table,
# one row per vertex, with global and local centrality columns.
# (`attributes` shares its name with base::attributes; the name is kept as is.)
attributes <- vertex_attr(g_comm_2_graph)
# Each column is extracted with `[[` and labelled explicitly; the power
# centrality was previously passed under the duplicated label
# `local_close_cent`.
g_comm_2_nodes <- data.frame(
  name = attributes[["name"]],
  comm = attributes[["comm"]],
  g_e_values = attributes[["g_e_values"]],
  g_close_cent = attributes[["g_close_cent"]],
  g_page_rank = attributes[["g_page_rank"]],
  g_power_cent = attributes[["g_power_cent"]],
  local_e_values = attributes[["local_e_values"]],
  local_page_rank = attributes[["local_page_rank"]],
  local_power_cent = attributes[["local_power_cent"]],
  local_close_cent = attributes[["local_close_cent"]],
  stringsAsFactors = FALSE
)
This is the creation of the subgraph that will only contain vertices listed in community 3.
# Node table and induced subgraph of community 3.
g_comm_3_nodes <- top_comms_nodes %>%
  filter(comm == 3)
g_comm_3_graph <- subgraph(projected, g_comm_3_nodes$name)
# Per-vertex eigenvector centrality inside community 3.
# The eigenvalues of the adjacency matrix are not per-vertex quantities, so
# they cannot be stored as a vertex attribute; the centrality score of each
# vertex is its entry in the leading eigenvector. `weights = NA` keeps the
# computation unweighted, as the plain adjacency matrix was.
# The list shape with a `values` element is kept for the code below.
e_values_3 <- list(
  values = unname(eigen_centrality(g_comm_3_graph, weights = NA)$vector)
)
g_comm_3_graph <- g_comm_3_graph %>%
  set_vertex_attr(name = "local_e_values", value = e_values_3$values)
# Density of the local eigenvector-centrality scores.
e_values_3["values"] %>%
  as.data.frame() %>%
  ggplot() +
  geom_density(aes(values)) +
  xlab("Eigenvector centrality")
Eigen values are skewed to the right for global community 3. This means that most nodes are not connected to high scoring nodes.
# Closeness centrality computed inside community 3 only.
close_cent <- closeness(g_comm_3_graph)
# Mean local closeness (the variable shares its name with stats::var).
var <- mean(close_cent)
g_comm_3_graph <- set_vertex_attr(
  g_comm_3_graph,
  name = "local_close_cent",
  value = close_cent
)
# Density of the local closeness scores.
ggplot(data.frame(closeness = unname(close_cent)), aes(closeness)) +
  geom_density() +
  xlab("Closeness centrality")
The mean closeness centrality is 0.0014738.
# Power centrality (exponent 0.9) computed inside community 3 only.
power_cent <- power_centrality(g_comm_3_graph, exponent = 0.9)
# Mean local power centrality in community 3.
var <- mean(power_cent)
g_comm_3_graph <- set_vertex_attr(
  g_comm_3_graph,
  name = "local_power_cent",
  value = power_cent
)
# Density of the local power-centrality scores.
ggplot(data.frame(power = as.numeric(power_cent)), aes(x = power)) +
  geom_density()
The mean Bonacich power centrality in community 3 is -0.6025666.
# PageRank computed inside community 3 only.
page_ranks <- page_rank(g_comm_3_graph)
# Mean local PageRank in community 3.
var <- mean(page_ranks$vector)
g_comm_3_graph <- set_vertex_attr(
  g_comm_3_graph,
  name = "local_page_rank",
  value = page_ranks$vector
)
# Density of the local PageRank scores.
ggplot(data.frame(page_r = as.numeric(page_ranks$vector)), aes(x = page_r)) +
  geom_density() +
  xlab("Page Rank")
The mean page rank is 0.0030675.
Creating the graph of centrality measures for community 3.
# Collect the vertex attributes of the community-3 subgraph into a node table,
# one row per vertex, with global and local centrality columns.
# (`attributes` shares its name with base::attributes; the name is kept as is.)
attributes <- vertex_attr(g_comm_3_graph)
# Each column is extracted with `[[` and labelled explicitly; the power
# centrality was previously passed under the duplicated label
# `local_close_cent`.
g_comm_3_nodes <- data.frame(
  name = attributes[["name"]],
  comm = attributes[["comm"]],
  g_e_values = attributes[["g_e_values"]],
  g_close_cent = attributes[["g_close_cent"]],
  g_page_rank = attributes[["g_page_rank"]],
  g_power_cent = attributes[["g_power_cent"]],
  local_e_values = attributes[["local_e_values"]],
  local_page_rank = attributes[["local_page_rank"]],
  local_power_cent = attributes[["local_power_cent"]],
  local_close_cent = attributes[["local_close_cent"]],
  stringsAsFactors = FALSE
)
The top nodes from selected communities will be compared to see which measure is the best indicator of higher ratings.
## [1] "Morgan Freeman"
Morgan Freeman has the highest degree of any node in the graph and could therefore be seen as an influential node; however, he may not be a central one. Morgan Freeman has acted with the greatest number of distinct actors according to the movies in this dataset.
## [1] "Highest global closeness centrality: Morgan Freeman"
Due to the high degree, it is not surprising that Morgan Freeman has the highest level of closeness centrality across the graph.
## [1] "Highest global Page rank: Morgan Freeman"
Morgan Freeman is considered the most important node by the PageRank algorithm
## [1] "Highest global Boncich power centrality: Matt Keeslar"
This actor himself is not considered the most influential however he has the most influential connections.
## [1] "Tom Wilkinson"
## [1] "Highest global closeness centrality: Tom Wilkinson"
## [1] "Highest local closeness centrality: Miranda Richardson"
## [1] "Highest global page rank: Tom Wilkinson"
## [1] "Highest local page rank: Tom Wilkinson"
## [1] "Highest global power centrality: R. Marcos Taylor"
## [1] "Highest local power centrality: Eric Sykes"
# Name(s) of the vertex with the largest degree inside community 2.
local({
  comm_2_degree <- degree(g_comm_2_graph)
  V(g_comm_2_graph)$name[comm_2_degree == max(comm_2_degree)]
})
## [1] "Scarlett Johansson"
## [1] "Highest global closeness centrality: Kristin Scott Thomas"
## [1] "Highest local closeness centrality: Rachael Harris"
## [1] "Highest global Page rank: Steve Coogan"
## [1] "Highest local Page rank: Richard Schiff"
## [1] "Highest global Boncich power centrality: Gary Coleman"
## [1] "Highest local Boncich power centrality: Pamela Anderson"
The actors Gary Coleman and Pamela Anderson have the highest global and local Bonacich power centrality, respectively. This means that across the whole graph (but limited to vertices in community 2), Gary Coleman has the most powerful connections while Pamela Anderson has the highest number of powerful connections within community 2.
## [1] "Morgan Freeman"
## [1] "Highest global closeness centrality: Morgan Freeman"
## [1] "Highest local closeness centrality: Morgan Freeman"
## [1] "Highest global Page rank: Morgan Freeman"
## [1] "Highest local Page rank: Morgan Freeman"
## [1] "Highest global Boncich power centrality: Tabu"
## [1] "Highest local Boncich power centrality: Charlize Theron"
The actors Tabu and Charlize Theron have the highest global and local Bonacich power centrality, respectively. This means that across the whole graph (but limited to vertices in community 3), Tabu has the most powerful connections while Charlize Theron has the highest number of powerful connections within community 3.
Here the average rating of movies starred in for each actor is calculated.
# Average IMDb score per actor over every movie the actor appears in,
# whichever of the three actor columns holds the name.
#
# The three name columns are stacked into one long actor/score table before
# averaging, so every movie counts exactly once. Averaging per-column averages
# in two successive joins would weight the columns 1/4, 1/4 and 1/2.
# Names are converted to character so the columns can be stacked whether or
# not read.csv produced factors.
# Rows with a blank or missing actor name, or a missing score, are excluded
# explicitly instead of relying on the blank name sorting into the first row.
average_imdb_actor_ratings <- bind_rows(
  transmute(imdb, actor = as.character(actor_1_name), imdb_score),
  transmute(imdb, actor = as.character(actor_2_name), imdb_score),
  transmute(imdb, actor = as.character(actor_3_name), imdb_score)
) %>%
  filter(!is.na(actor), actor != "", !is.na(imdb_score)) %>%
  group_by(actor) %>%
  summarise(avg_rating = mean(imdb_score)) %>%
  ungroup()
# Look up the average rating of one actor.
filter(average_imdb_actor_ratings, actor == "Morgan Freeman")
## # A tibble: 1 x 2
## actor avg_rating
## <chr> <dbl>
## 1 Morgan Freeman 7.76
Morgan Freeman has an average movie rating of 7.7605. In terms of the overall graph, this actor has the highest degree, closeness centrality and page rank.
# Look up the average rating of one actor.
filter(average_imdb_actor_ratings, actor == "Matt Keeslar")
## # A tibble: 1 x 2
## actor avg_rating
## <chr> <dbl>
## 1 Matt Keeslar 7
Matt Keeslar has an average movie rating of 7. In terms of the overall graph, this actor has the highest Bonacich centrality meaning he has very influential alters.
# Look up the average rating of one actor.
filter(average_imdb_actor_ratings, actor == "Tom Wilkinson")
## # A tibble: 1 x 2
## actor avg_rating
## <chr> <dbl>
## 1 Tom Wilkinson 7.08
Tom Wilkinson has an average movie rating of 7.077083. In terms of community 1, this actor has the highest degree, global closeness centrality and global as well as local page rank.
# Look up the average rating of one actor.
filter(average_imdb_actor_ratings, actor == "Miranda Richardson")
## # A tibble: 1 x 2
## actor avg_rating
## <chr> <dbl>
## 1 Miranda Richardson 6.86
Miranda Richardson has an average movie rating of 6.855. In terms of community 1, this actor has the highest local closeness centrality meaning she is very central within community 1 but not overall in the graph.
# Look up the average rating of one actor.
filter(average_imdb_actor_ratings, actor == "R. Marcos Taylor")
## # A tibble: 1 x 2
## actor avg_rating
## <chr> <dbl>
## 1 R. Marcos Taylor 7.9
R. Marcos Taylor has an average movie rating of 7.9. In terms of community 1, this actor has the highest global Bonacich centrality meaning that across the graph he has influential alters.
# Look up the average rating of one actor.
filter(average_imdb_actor_ratings, actor == "Eric Sykes")
## # A tibble: 1 x 2
## actor avg_rating
## <chr> <dbl>
## 1 Eric Sykes 7.6
Eric Sykes has an average movie rating of 7.6. In terms of community 1, this actor has the highest local Bonacich centrality meaning that if only looking at community 1, Eric Sykes has the most influential alters.
# Look up the average rating of one actor.
filter(average_imdb_actor_ratings, actor == "Scarlett Johansson")
## # A tibble: 1 x 2
## actor avg_rating
## <chr> <dbl>
## 1 Scarlett Johansson 7.52
Scarlett Johansson has an average movie rating of 7.522159. In terms of community 2, this actor has the highest degree.
# Look up the average rating of one actor.
filter(average_imdb_actor_ratings, actor == "Kristin Scott Thomas")
## # A tibble: 1 x 2
## actor avg_rating
## <chr> <dbl>
## 1 Kristin Scott Thomas 6.94
Kristin Scott Thomas has an average movie rating of 6.939583. In terms of community 2, this actor has the highest global closeness centrality meaning she is very central overall in the graph but not the most central if only looking at community 2.
# Look up the average rating of one actor.
filter(average_imdb_actor_ratings, actor == "Rachael Harris")
## # A tibble: 1 x 2
## actor avg_rating
## <chr> <dbl>
## 1 Rachael Harris 6.21
Rachael Harris has an average movie rating of 6.208333. In terms of community 2, this actor has the highest local closeness centrality meaning she is very central within community 2 but not overall in the graph.
# Look up the average rating of one actor.
filter(average_imdb_actor_ratings, actor == "Steve Coogan")
## # A tibble: 1 x 2
## actor avg_rating
## <chr> <dbl>
## 1 Steve Coogan 6.29
Steve Coogan has an average movie rating of 6.2875. In terms of community 2, this actor has the highest global page rank.
# Look up the average rating of one actor.
filter(average_imdb_actor_ratings, actor == "Richard Schiff")
## # A tibble: 1 x 2
## actor avg_rating
## <chr> <dbl>
## 1 Richard Schiff 6.14
Richard Schiff has an average movie rating of 6.143333. In terms of community 2, this actor has the highest local page rank.
# Look up the average rating of one actor.
filter(average_imdb_actor_ratings, actor == "Gary Coleman")
## # A tibble: 1 x 2
## actor avg_rating
## <chr> <dbl>
## 1 Gary Coleman 6.15
Gary Coleman has an average movie rating of 6.15. In terms of community 2, this actor has the highest global Bonacich centrality and has influential alters across the network.
# Look up the average rating of one actor.
filter(average_imdb_actor_ratings, actor == "Pamela Anderson")
## # A tibble: 1 x 2
## actor avg_rating
## <chr> <dbl>
## 1 Pamela Anderson 5.5
Pamela Anderson has an average movie rating of 5.5. In terms of community 2, this actor has the highest local Bonacich centrality and has influential alters within community 2.
##### Community 3
# Look up the average rating of one actor.
filter(average_imdb_actor_ratings, actor == "Morgan Freeman")
## # A tibble: 1 x 2
## actor avg_rating
## <chr> <dbl>
## 1 Morgan Freeman 7.76
Morgan Freeman has an average movie rating of 7.7605. In terms of community 3, this actor has the highest degree, closeness centrality and page rank in terms of both local and global calculations.
# Look up the average rating of one actor.
filter(average_imdb_actor_ratings, actor == "Tabu")
## # A tibble: 1 x 2
## actor avg_rating
## <chr> <dbl>
## 1 Tabu 7.8
Tabu acts primarily in Hindi films and is the only actor highlighted not from Western films. The average movie rating is 7.8 and in terms of community 3, Tabu has the highest global Bonacich centrality.
# Look up the average rating of one actor.
filter(average_imdb_actor_ratings, actor == "Charlize Theron")
## # A tibble: 1 x 2
## actor avg_rating
## <chr> <dbl>
## 1 Charlize Theron 6.59
Charlize Theron has an average movie rating of 6.586667 and has the highest local Bonacich power centrality within community 3.
# One row per actor of communities 1-3, holding the average movie rating next
# to the global and local centrality measures.
#
# The three community tables share the same columns, so they are stacked
# first and joined to the ratings once, by `name`. Chaining three left joins
# made the second and third join match on every shared centrality column,
# which never matched, so only community 1 survived the final na.omit().
ratings_and_centrality <- average_imdb_actor_ratings %>%
  dplyr::rename(name = actor) %>%
  inner_join(
    bind_rows(g_comm_1_nodes, g_comm_2_nodes, g_comm_3_nodes),
    by = "name"
  ) %>%
  na.omit()
## Joining, by = "name"
## Joining, by = c("name", "comm", "g_e_values", "g_close_cent", "g_page_rank", "g_power_cent", "local_e_values", "local_page_rank", "local_power_cent", "local_close_cent")
## Joining, by = c("name", "comm", "g_e_values", "g_close_cent", "g_page_rank", "g_power_cent", "local_e_values", "local_page_rank", "local_power_cent", "local_close_cent")
# Loess-smoothed trend of the average movie rating against each of the four
# global centrality measures, one plot per measure.
ggplot(ratings_and_centrality, aes(x = g_e_values, y = avg_rating)) +
  geom_smooth(method = "loess") +
  labs(
    x = "Global Eigen values",
    y = "Avg movie rating",
    title = "Global Eigen values vs Average Movie rating"
  )
ggplot(ratings_and_centrality, aes(x = g_close_cent, y = avg_rating)) +
  geom_smooth(method = "loess") +
  labs(
    x = "Global Closeness centrality values",
    y = "Avg movie rating",
    title = "Global Closeness Centrality vs Average Movie rating"
  )
ggplot(ratings_and_centrality, aes(x = g_page_rank, y = avg_rating)) +
  geom_smooth(method = "loess") +
  labs(
    x = "Global Page Rank values",
    y = "Avg movie rating",
    title = "Global Page Rank vs Average Movie rating"
  )
ggplot(ratings_and_centrality, aes(x = g_power_cent, y = avg_rating)) +
  geom_smooth(method = "loess") +
  labs(
    x = "Global Power Centrality",
    y = "Avg movie rating",
    title = "Global Boncich Power Centrality vs Average Movie rating"
  )
None of the graphs show any strong correlation between the global centrality and the average rating of the movie. It will now be explored whether using local centrality measures will produce a different outcome.
# Loess-smoothed trend of the average movie rating against each of the four
# local (within-community) centrality measures, one plot per measure.
ggplot(ratings_and_centrality, aes(x = local_e_values, y = avg_rating)) +
  geom_smooth(method = "loess") +
  labs(
    x = "Eigen vector Centrality",
    y = "Avg movie rating",
    title = "Local Eigen vector Centrality vs Average Movie rating"
  )
ggplot(ratings_and_centrality, aes(x = local_close_cent, y = avg_rating)) +
  geom_smooth(method = "loess") +
  labs(
    x = "Closeness Centrality",
    y = "Avg movie rating",
    title = "Local Closeness Centrality vs Average Movie rating"
  )
ggplot(ratings_and_centrality, aes(x = local_page_rank, y = avg_rating)) +
  geom_smooth(method = "loess") +
  labs(
    x = "Page rank",
    y = "Avg movie rating",
    title = "Local Page rank vs Average Movie rating"
  )
ggplot(ratings_and_centrality, aes(x = local_power_cent, y = avg_rating)) +
  geom_smooth(method = "loess") +
  labs(
    x = "Power Centrality",
    y = "Avg movie rating",
    title = "Local Boncich Power Centrality vs Average Movie rating"
  )
The local centralities do not appear to have any correlation to the average movie rating.
It can be said that the centrality of nodes is not an indicator of success for movie ratings. The variance in movie ratings is relatively high for very central nodes as well as for less central nodes.